import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import prince
from scipy import stats
import os
from ydata_profiling import ProfileReport
# UVa\ds6001\mod10\live\ahs_cleaned-1.csv
ahs = pd.read_csv('ahs_cleaned-1.csv', na_values=[-6, "'-9'"])
ahs.head(2).T
| 0 | 1 | |
|---|---|---|
| Unnamed: 0 | 0 | 1 |
| DIVISION | South Atlantic | New England |
| TENURE | Owned or being bought by someone in your house... | Owned or being bought by someone in your house... |
| YRBUILT | 2000 | 1970 |
| UNITSIZE | 2,000 to 2,499 square feet | 3,000 to 3,999 square feet |
| HSHLDTYPE | Married-couple family household | Nonfamily household |
| HHRACE | White only | White only |
| HHSEX | Male | Female |
| HINCP | 257000.0 | 201000.0 |
| TOTHCAMT | 1642.0 | 1049.0 |
| MARKETVAL | 280249.0 | 1000270.0 |
| MAINTAMT | 1022.0 | 295.0 |
| FUSEBLOW | No fuses / breakers blown in the last 3 months | No fuses / breakers blown in the last 3 months |
| SEWBREAK | No breakdowns in the last 3 months | No breakdowns in the last 3 months |
| ROACH | No signs in the last 12 months | No signs in the last 12 months |
| RODENT | No signs in the last 12 months | No signs in the last 12 months |
| NOWIRE | Not broken | Not broken |
| PLUGS | Not broken | Not broken |
| COLD | Not broken | Not broken |
| NOTOIL | Not broken | Not broken |
| NOWAT | Not broken | Not broken |
| FLOORHOLE | Not broken | Not broken |
| FNDCRUMB | Not broken | Not broken |
| PAINTPEEL | Not broken | Not broken |
| ROOFHOLE | Not broken | Not broken |
| ROOFSAG | Not broken | Not broken |
| ROOFSHIN | Not broken | Not broken |
| WALLCRACK | Not broken | Not broken |
| WALLSIDE | Not broken | Not broken |
| WALLSLOPE | Not broken | Not broken |
| WINBOARD | Not broken | Not broken |
| WINBROKE | Not broken | Not broken |
| LEAKI | Not broken | Not broken |
| MOLDBATH | Not broken | Not broken |
ahs
| Unnamed: 0 | DIVISION | TENURE | YRBUILT | UNITSIZE | HSHLDTYPE | HHRACE | HHSEX | HINCP | TOTHCAMT | ... | ROOFHOLE | ROOFSAG | ROOFSHIN | WALLCRACK | WALLSIDE | WALLSLOPE | WINBOARD | WINBROKE | LEAKI | MOLDBATH | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | South Atlantic | Owned or being bought by someone in your house... | 2000 | 2,000 to 2,499 square feet | Married-couple family household | White only | Male | 257000.0 | 1642.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 1 | 1 | New England | Owned or being bought by someone in your house... | 1970 | 3,000 to 3,999 square feet | Nonfamily household | White only | Female | 201000.0 | 1049.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 2 | 2 | West South Central | NaN | 1970 | 750 to 999 square feet | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | Broken | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 3 | West South Central | Owned or being bought by someone in your house... | 1970 | 2,000 to 2,499 square feet | Married-couple family household | White only | Male | 66900.0 | 671.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 4 | 4 | West North Central | Rented | 1970 | 750 to 999 square feet | Nonfamily household | Black only | Female | 35000.0 | 680.0 | ... | NaN | NaN | NaN | Not broken | NaN | NaN | NaN | NaN | Not broken | Not broken |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 63180 | 63180 | East North Central | Owned or being bought by someone in your house... | 2016 | 4,000 square feet or more | Nonfamily household | White only | Male | 74000.0 | 6171.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 63181 | 63181 | South Atlantic | Owned or being bought by someone in your house... | 2018 | 1,500 to 1,999 square feet | Married-couple family household | White only | Male | 207000.0 | 2520.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 63182 | 63182 | South Atlantic | Owned or being bought by someone in your house... | 2018 | 2,000 to 2,499 square feet | Married-couple family household | White only | Female | 158100.0 | 1896.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 63183 | 63183 | South Atlantic | Owned or being bought by someone in your house... | 2018 | 2,500 to 2,999 square feet | Married-couple family household | White only | Male | 130200.0 | 2008.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 63184 | 63184 | South Atlantic | Owned or being bought by someone in your house... | 2016 | 3,000 to 3,999 square feet | Married-couple family household | White only | Female | 120000.0 | 2122.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
63185 rows × 34 columns
ahs = ahs.iloc[:,1:]
ahs.index.name = 'row_id'
ahs
| DIVISION | TENURE | YRBUILT | UNITSIZE | HSHLDTYPE | HHRACE | HHSEX | HINCP | TOTHCAMT | MARKETVAL | ... | ROOFHOLE | ROOFSAG | ROOFSHIN | WALLCRACK | WALLSIDE | WALLSLOPE | WINBOARD | WINBROKE | LEAKI | MOLDBATH | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| row_id | |||||||||||||||||||||
| 0 | South Atlantic | Owned or being bought by someone in your house... | 2000 | 2,000 to 2,499 square feet | Married-couple family household | White only | Male | 257000.0 | 1642.0 | 280249.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 1 | New England | Owned or being bought by someone in your house... | 1970 | 3,000 to 3,999 square feet | Nonfamily household | White only | Female | 201000.0 | 1049.0 | 1000270.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 2 | West South Central | NaN | 1970 | 750 to 999 square feet | NaN | NaN | NaN | NaN | NaN | 96118.0 | ... | NaN | NaN | NaN | Broken | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | West South Central | Owned or being bought by someone in your house... | 1970 | 2,000 to 2,499 square feet | Married-couple family household | White only | Male | 66900.0 | 671.0 | 198510.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 4 | West North Central | Rented | 1970 | 750 to 999 square feet | Nonfamily household | Black only | Female | 35000.0 | 680.0 | NaN | ... | NaN | NaN | NaN | Not broken | NaN | NaN | NaN | NaN | Not broken | Not broken |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 63180 | East North Central | Owned or being bought by someone in your house... | 2016 | 4,000 square feet or more | Nonfamily household | White only | Male | 74000.0 | 6171.0 | 925211.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 63181 | South Atlantic | Owned or being bought by someone in your house... | 2018 | 1,500 to 1,999 square feet | Married-couple family household | White only | Male | 207000.0 | 2520.0 | 372363.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 63182 | South Atlantic | Owned or being bought by someone in your house... | 2018 | 2,000 to 2,499 square feet | Married-couple family household | White only | Female | 158100.0 | 1896.0 | 257198.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 63183 | South Atlantic | Owned or being bought by someone in your house... | 2018 | 2,500 to 2,999 square feet | Married-couple family household | White only | Male | 130200.0 | 2008.0 | 510021.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
| 63184 | South Atlantic | Owned or being bought by someone in your house... | 2016 | 3,000 to 3,999 square feet | Married-couple family household | White only | Female | 120000.0 | 2122.0 | 440376.0 | ... | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken | Not broken |
63185 rows × 33 columns
ahs.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 63185 entries, 0 to 63184 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DIVISION 63185 non-null object 1 TENURE 54455 non-null object 2 YRBUILT 63185 non-null int64 3 UNITSIZE 57629 non-null object 4 HSHLDTYPE 54455 non-null object 5 HHRACE 54455 non-null object 6 HHSEX 54455 non-null object 7 HINCP 54455 non-null float64 8 TOTHCAMT 54455 non-null float64 9 MARKETVAL 38390 non-null float64 10 MAINTAMT 32972 non-null float64 11 FUSEBLOW 54435 non-null object 12 SEWBREAK 54355 non-null object 13 ROACH 54455 non-null object 14 RODENT 54455 non-null object 15 NOWIRE 63035 non-null object 16 PLUGS 63035 non-null object 17 COLD 50479 non-null object 18 NOTOIL 54417 non-null object 19 NOWAT 53731 non-null object 20 FLOORHOLE 63185 non-null object 21 FNDCRUMB 41861 non-null object 22 PAINTPEEL 63185 non-null object 23 ROOFHOLE 41939 non-null object 24 ROOFSAG 42094 non-null object 25 ROOFSHIN 41956 non-null object 26 WALLCRACK 63185 non-null object 27 WALLSIDE 42168 non-null object 28 WALLSLOPE 42202 non-null object 29 WINBOARD 42373 non-null object 30 WINBROKE 42339 non-null object 31 LEAKI 54455 non-null object 32 MOLDBATH 53820 non-null object dtypes: float64(4), int64(1), object(28) memory usage: 15.9+ MB
ahs.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| YRBUILT | 63185.0 | 1970.509646 | 26.429845 | 1919.0 | 1950.0 | 1970.0 | 1990.00 | 2019.0 |
| HINCP | 54455.0 | 87066.124176 | 100064.851607 | -5000.0 | 27500.0 | 60000.0 | 111000.00 | 3876000.0 |
| TOTHCAMT | 54455.0 | 1517.628739 | 1783.335753 | 0.0 | 670.0 | 1164.0 | 1892.50 | 100700.0 |
| MARKETVAL | 38390.0 | 376276.939750 | 553786.639374 | 1000.0 | 140446.5 | 255273.0 | 435968.25 | 9999998.0 |
| MAINTAMT | 32972.0 | 874.907710 | 1357.366635 | -9.0 | 2.0 | 460.5 | 1016.00 | 9998.0 |
profile = ProfileReport(ahs, title='AHS Profiling Report', html={'style':{'full_width':True}})
profile.to_notebook_iframe()
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
ahs['RODENT'].value_counts()
No signs in the last 12 months 48821 Seen a few times in the last 12 months 4212 Seen monthly in the last 12 months 522 Seen daily in the last 12 months 474 Seen weekly in the last 12 months 426 Name: RODENT, dtype: int64
%matplotlib inline
ahs['RODENT'].value_counts().plot()
<AxesSubplot: >
stats.f_oneway(
ahs.query("RODENT == 'No signs in the last 12 months'").HINCP.dropna(),
ahs.query("RODENT == 'Seen a few times in the last 12 months'").HINCP.dropna(),
ahs.query("RODENT == 'Seen daily in the last 12 months'").HINCP.dropna(),
ahs.query("RODENT == 'Seen monthly in the last 12 months'").HINCP.dropna(),
ahs.query("RODENT == 'Seen weekly in the last 12 months'").HINCP.dropna(),
)
F_onewayResult(statistic=21.68467615110672, pvalue=6.703833330074091e-18)
ahs.groupby("RODENT").HINCP.mean().sort_values(ascending=False).to_frame('mean_inc').style.bar()
| mean_inc | |
|---|---|
| RODENT | |
| No signs in the last 12 months | 87738.246779 |
| Seen a few times in the last 12 months | 86156.387464 |
| Seen monthly in the last 12 months | 82798.544061 |
| Seen weekly in the last 12 months | 64086.826291 |
| Seen daily in the last 12 months | 51274.924051 |
ahs[['MARKETVAL', 'YRBUILT' ]].corr()
| MARKETVAL | YRBUILT | |
|---|---|---|
| MARKETVAL | 1.00000 | -0.00403 |
| YRBUILT | -0.00403 | 1.00000 |
ahs[['MARKETVAL', 'YRBUILT' ]].corr('kendall')
| MARKETVAL | YRBUILT | |
|---|---|---|
| MARKETVAL | 1.000000 | 0.083796 |
| YRBUILT | 0.083796 | 1.000000 |
ahs2 = ahs[['MARKETVAL', 'YRBUILT' ]].dropna()
stats.pearsonr(ahs2.MARKETVAL, ahs2.YRBUILT)
(-0.004029500232993765, 0.4298243664197942)
broken = ahs[['FUSEBLOW','ROACH','RODENT','NOWIRE','PLUGS']].dropna()
broken.shape
(54435, 5)
# sample 5000 rows from broken
broken = broken.sample(4000, random_state=42)
prince_mca = prince.MCA(n_components=2)
prince_mca = prince_mca.fit(broken)
#gss_mca = prince_mca.transform(gss_cat)
prince_mca.row_coordinates(broken)
| 0 | 1 | |
|---|---|---|
| row_id | ||
| 37340 | 0.342858 | 0.115637 |
| 26264 | 1.115773 | -1.465172 |
| 43884 | -0.175239 | -0.011974 |
| 35036 | -0.175239 | -0.011974 |
| 9002 | 0.118970 | 0.677848 |
| ... | ... | ... |
| 59549 | -0.175239 | -0.011974 |
| 23758 | -0.111287 | 0.235141 |
| 6106 | -0.175239 | -0.011974 |
| 36339 | -0.111287 | 0.235141 |
| 52994 | -0.175239 | -0.011974 |
4000 rows × 2 columns
plt = prince_mca.plot(
broken,
x_component=0,
y_component=1
)
plt